import numpy as np
import pandas as pd
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2, f_regression, mutual_info_regression
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
try:
    data = pd.read_csv("survey_results_public.csv")
    print("The survey dataset contains {} rows with {} features each.".format(*data.shape))
except FileNotFoundError:
    print("Dataset not found. Is the CSV file in the folder?")
dataFinal = data[(data["Country"] == "United States") & (data["EmploymentStatus"] == "Employed full-time") & (data["Currency"] == "U.S. dollars ($)")]
salaryDf = pd.DataFrame(dataFinal["Salary"])
print "Descrição de todos os salary"
display(salaryDf.describe())
q = dataFinal["Salary"].quantile(0.95)
qm = dataFinal["Salary"].quantile(0.05)
print "Descrição dos salays sem os outliers"
display(salaryDf[(salaryDf["Salary"] < q) & (salaryDf["Salary"] > qm)].describe())
# Keep only the rows without outliers in the dataset
dataFinal = dataFinal[(dataFinal["Salary"] < q) & (dataFinal["Salary"] > qm)]
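# A one-call equivalent of the trim above (a sketch, not used below; the string
# form of `inclusive` assumes pandas >= 1.3): Series.between with exclusive
# bounds keeps the middle 90% of salaries.
# dataFinal = dataFinal[dataFinal["Salary"].between(qm, q, inclusive="neither")]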
# Collect all features into 3 groups: multiple-answer, single-answer, and numeric
features_multiples = ["DeveloperType", "HaveWorkedLanguage", "HaveWorkedDatabase", "HaveWorkedPlatform", "Race", "MetricAssess", "IDE", "Methodology"]
features_normal = ["FormalEducation", "MajorUndergrad", "University", "YearsProgram", "CompanySize", "YearsCodedJob", "Professional", "ProgramHobby", "HomeRemote", "CompanyType", "PronounceGIF", "ProblemSolving", "BuildingThings", "LearningNewTech", "BoringDetails", "JobSecurity", "DiversityImportant", "AnnoyingUI", "FriendsDevelopers", "RightWrongWay", "UnderstandComputers", "SeriousWork", "InvestTimeTools", "WorkPayCare", "KinshipDevelopers", "ChallengeMyself", "CompetePeers", "ChangeWorld", "AuditoryEnvironment", "VersionControl", "CheckInCode", "ShipIt", "OtherPeoplesCode", "EnjoyDebugging", "InTheZone", "DifficultCommunication", "CollaborateRemote", "InfluenceInternet", "InfluenceWorkstation", "InfluenceHardware", "InfluenceServers", "InfluenceTechStack", "InfluenceDeptTech", "InfluenceVizTools", "InfluenceDatabase", "InfluenceCloud", "InfluenceConsultants", "InfluenceRecruitment", "InfluenceCommunication", "Gender", "HighestEducationParents", "SurveyLong"]
features_int = ["CareerSatisfaction", "JobSatisfaction", "HoursPerWeek"]
plt.rcParams['figure.figsize'] = (11, 6)
# Analysis: one violin plot of Salary against each single-answer feature
for i, columnChart in enumerate(features_normal):
    try:
        plt.figure(i)
        chart = sns.violinplot(x=columnChart, y="Salary", data=dataFinal)
        chart.set_xticklabels(chart.get_xticklabels(), rotation=40, ha="right")
        plt.title("Salary x {}".format(columnChart))
        plt.tight_layout()
        plt.show()
    except UnicodeDecodeError:
        print(columnChart)
# Distribution of Salary
sns.distplot(dataFinal["Salary"].dropna())
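# Note: distplot is deprecated in seaborn >= 0.11; the equivalent call there
# (a sketch, not used below) would be:
# sns.histplot(dataFinal["Salary"].dropna(), kde=True)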
def get_dummies_skills(X_column):
    # Build the set of distinct skills. This could be done once and cached so loading is faster.
    skillsList = []
    for skills in X_all[X_column]:
        skillsSplit = skills.split(";")
        for skill in skillsSplit:
            skillsList.append(skill.strip())
    skillsList = set(skillsList)
    # For each skill we create a dummy column, since these are categorical dimensions
    for skill in skillsList:
        column = []
        for skills in X_all[X_column]:
            skillsSplit = skills.split(";")
            skillsSplit = [x.strip(' ') for x in skillsSplit]
            if skill.strip() in skillsSplit:
                column.append(1)
            else:
                column.append(0)
        X_all[X_column + "_" + skill.strip()] = column
    X_all.drop(X_column, axis=1, inplace=True)
def get_dummies_normal(X_column):
    # One-hot encode a single-answer categorical column and drop the original
    dataDummies = X_all.join(pd.get_dummies(X_all[X_column], prefix=X_column))
    dataDummies.drop(X_column, axis=1, inplace=True)
    return dataDummies
def get_dummies_skills_noise(X_column):
    # Same as get_dummies_skills, but operating on the noise (Brazil) dataset X_all_noise
    skillsList = []
    for skills in X_all_noise[X_column]:
        skillsSplit = skills.split(";")
        for skill in skillsSplit:
            skillsList.append(skill.strip())
    skillsList = set(skillsList)
    # For each skill we create a dummy column, since these are categorical dimensions
    for skill in skillsList:
        column = []
        for skills in X_all_noise[X_column]:
            skillsSplit = skills.split(";")
            skillsSplit = [x.strip(' ') for x in skillsSplit]
            if skill.strip() in skillsSplit:
                column.append(1)
            else:
                column.append(0)
        X_all_noise[X_column + "_" + skill.strip()] = column
    X_all_noise.drop(X_column, axis=1, inplace=True)
def get_dummies_normal_noise(X_column):
    # Same as get_dummies_normal, but operating on the noise (Brazil) dataset X_all_noise
    dataDummies = X_all_noise.join(pd.get_dummies(X_all_noise[X_column], prefix=X_column))
    dataDummies.drop(X_column, axis=1, inplace=True)
    return dataDummies
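# A vectorized alternative to the four helpers above (a sketch, not called
# below; the function name is illustrative): pandas' Series.str.get_dummies
# splits on the separator and one-hot encodes in a single call, avoiding the
# Python-level loops. It assumes the column has no NaNs (they are filled
# upstream); whitespace around each skill is stripped so the dummy column
# names match the loop-based version.
def get_dummies_skills_vectorized(df, X_column):
    cleaned = df[X_column].str.split(";").apply(lambda xs: ";".join(s.strip() for s in xs))
    dummies = cleaned.str.get_dummies(sep=";").add_prefix(X_column + "_")
    return df.drop(X_column, axis=1).join(dummies)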
def model(regr, X_train, X_test, y_train, y_test):
    startAll = time()
    # Train the model
    startReg = time()
    regr.fit(X_train, y_train)
    endReg = time()
    print("Model training time: {:.4f} seconds".format(endReg - startReg))
    # Predict on the training set
    startPred = time()
    y_pred = regr.predict(X_train)
    endPred = time()
    # The mean squared error
    print("\nMean squared error (train): %.2f" % mean_squared_error(y_train, y_pred))
    # Explained variance score: 1 is perfect prediction
    print("R2 Score (train): %.2f" % r2_score(y_train, y_pred))
    print("Training prediction time: {:.4f} seconds".format(endPred - startPred))
    # Predict on the test set
    startPred = time()
    y_pred = regr.predict(X_test)
    endPred = time()
    # The mean squared error
    print("\nMean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    # Explained variance score: 1 is perfect prediction
    print("R2 Score: %.2f" % r2_score(y_test, y_pred))
    print("Test prediction time: {:.4f} seconds".format(endPred - startPred))
    endAll = time()
    print("\nTotal operation time: {:.4f} seconds".format(endAll - startAll))
features = features_multiples + features_normal + features_int
columns = features + ["Salary"]
dataBench = dataFinal[columns].copy()
# Total number of users from the USA
print("Total users: {}".format(len(dataBench)))
for fNull in features_multiples + features_normal:
    dataBench[fNull] = dataBench[fNull].fillna("Has no opinion")
# Final count
dataBench = dataBench.dropna(how='any')
print("Total users after dropping nulls: {}".format(len(dataBench)))
X_all = dataBench[features]
y_all = dataBench["Salary"]
# Normalization
from sklearn import preprocessing
Y_all_origin = y_all
y_all = preprocessing.robust_scale(y_all.values.reshape(-1, 1))
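# What robust_scale does with its defaults: center on the median and divide by
# the interquartile range, so the remaining salary outliers barely influence
# the scale. A by-hand sketch of the same transform (not used below):
def robust_scale_by_hand(y):
    med = np.median(y)
    iqr = np.percentile(y, 75) - np.percentile(y, 25)
    return (y - med) / iqr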
# Build the dummies
for feature_multiple in features_multiples:
    get_dummies_skills(feature_multiple)
for feature_normal in features_normal:
    X_all = get_dummies_normal(feature_normal)
print "Contém {} linhas com {} caracteristicas cada.".format(*X_all.shape)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=0.7, random_state=42)
print "O conjunto de treinamento tem {} amostras.".format(X_train.shape[0])
print "O conjunto de teste tem {} amostras.".format(X_test.shape[0])
# Run the benchmark regression
model(linear_model.BayesianRidge(), X_train, X_test, y_train, y_test)
y_mean = np.empty(len(y_all)).reshape(-1, 1)
y_mean.fill(y_all.mean())
print("\nMean squared error with mean: %.2f" % mean_squared_error(y_all, y_mean))
print('R2 Score with mean: %.2f' % r2_score(y_all, y_mean))
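# Predicting the mean for every row gives R2 = 0 by construction, since R2
# measures improvement over exactly this baseline; any useful model must beat
# a score of 0 here.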
features_int = []
features = features_normal + features_multiples + features_int
columns = features + ["Salary"]
dataFinal = dataFinal[columns].copy()
# Replace nulls in some features that contain missing data
#dataFinal["MajorUndergrad"] = dataFinal["MajorUndergrad"].fillna("Has no graduation")
#dataFinal["HaveWorkedLanguage"] = dataFinal["HaveWorkedLanguage"].fillna("Has no skill")
#dataFinal["HaveWorkedDatabase"] = dataFinal["HaveWorkedDatabase"].fillna("Has no skill")
#dataFinal["HaveWorkedPlatform"] = dataFinal["HaveWorkedPlatform"].fillna("Has no skill")
for fNull in features_multiples + features_normal:
    dataFinal[fNull] = dataFinal[fNull].fillna("Has no opinion")
# Total number of users from the USA
print("Total users: {}".format(len(dataFinal)))
# Describe the number of nulls in each feature
# for feature in columns:
#     print("Category {} has {} nulls".format(feature, len(dataFinal[dataFinal[feature].isnull() == True])))
# Final count
dataFinal = dataFinal.dropna(how='any')
print("Total users after dropping nulls: {}".format(len(dataFinal)))
X_all = dataFinal[features]
y_all = dataFinal["Salary"]
# Normalization
Y_all_origin = y_all
y_all = preprocessing.robust_scale(y_all.values.reshape(-1, 1))
# Build the dummies
for feature_multiple in features_multiples:
    get_dummies_skills(feature_multiple)
for feature_normal in features_normal:
    X_all = get_dummies_normal(feature_normal)
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', linear_model.BayesianRidge())
])
N_FEATURES_OPTIONS = [10, 20, 40, 80, 150, 200]
MAX_DEPTH = [1, 2, 3]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
    },
    {
        'reduce_dim': [SelectKBest(f_regression)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
    }
]
reducer_labels = ['PCA', 'NMF', 'KBest(f_regression)']
grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
grid.fit(X_all, y_all)
mean_scores = np.array(grid.cv_results_['mean_test_score'])
# scores follow param_grid iteration order: PCA, NMF, then KBest(f_regression)
mean_scores = mean_scores.reshape(1, -1, len(N_FEATURES_OPTIONS))
# collapse the leading axis (kept from the sklearn example this is based on; a no-op here)
mean_scores = mean_scores.max(axis=0)
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)
plt.figure()
COLORS = 'bgrcmyk'
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
plt.title("Comparing feature reduction techniques")
plt.xlabel('Reduced number of features')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('R2 score')
plt.ylim((0, 1))
plt.legend(loc='upper left')
from sklearn.ensemble import AdaBoostRegressor
pipe = Pipeline([
    ('reduce_dim', SelectKBest(f_regression)),
    ('classify', linear_model.LinearRegression(normalize=True))
])
#N_FEATURES_OPTIONS = [80]
N_FEATURES_OPTIONS = [80, 100, 150, 200, 250, 300, 350, 400, 450, 500]
MAX_DEPTH = [1, 2, 3]
param_grid = [
    # {
    #     'reduce_dim': [PCA(iterated_power=7), NMF()],
    #     'reduce_dim__n_components': N_FEATURES_OPTIONS,
    #     'classify__C': C_OPTIONS
    # },
    {
        'reduce_dim': [SelectKBest(f_regression)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify': [linear_model.LinearRegression(normalize=True), linear_model.BayesianRidge(normalize=True),
                     linear_model.Lasso(normalize=True), linear_model.Ridge(normalize=True),
                     linear_model.LassoLars(normalize=True), RandomForestRegressor(max_depth=3, random_state=42)],
        # 'classify__max_depth': MAX_DEPTH
    },
]
reducer_labels = ['Linear', "Bayesian Ridge", "Lasso", "Ridge", "LassoLars", "RandomForest"]
grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
grid.fit(X_all, y_all)
mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
# scores follow param_grid iteration order: one row per regressor
mean_scores = mean_scores.reshape(-1, len(N_FEATURES_OPTIONS))
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)
plt.figure()
COLORS = 'bgrcmyk'
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
plt.title("Comparing regressions models")
plt.xlabel('Reduced number of features by F Regression')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('Score')
plt.ylim((0, 1))
plt.legend(loc='upper left')
best_parameters = grid.best_params_
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
featureSelection = SelectKBest(f_regression, k=200)
featureSelectionFit = featureSelection.fit(X_all, y_all)
np.set_printoptions(precision=3)
columnsSelection = featureSelectionFit.get_support()
X_all_selection = featureSelectionFit.transform(X_all)
scores = featureSelectionFit.scores_
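# f_regression scores each column by the F statistic of a univariate linear
# fit against Salary: higher means that feature alone explains more variance.
# A quick sanity check of what survived the selection (sketch):
selected_columns = X_all.columns[columnsSelection]
print("Kept {} of {} columns".format(len(selected_columns), X_all.shape[1]))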
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kf = KFold(n_splits=3, shuffle=True, random_state=42)
for k, (train, test) in enumerate(kf.split(X_all_selection, y_all)):
    X_train = X_all_selection[train]
    X_test = X_all_selection[test]
    y_train = y_all[train]
    y_test = y_all[test]
    print("The training set has {} samples.".format(X_train.shape[0]))
    print("The test set has {} samples.".format(X_test.shape[0]))
    model(linear_model.BayesianRidge(), X_train, X_test, y_train, y_test)
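# The fold loop above can be condensed with cross_val_score (imported but not
# otherwise used); a sketch using the same 3-fold split and the estimator's
# default scorer, which is R2 for BayesianRidge:
cv_scores = cross_val_score(linear_model.BayesianRidge(), X_all_selection, y_all.ravel(), cv=kf)
print("R2 per fold: {}".format(cv_scores))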
pipe = Pipeline([
    ('classify', linear_model.BayesianRidge(normalize=True))
])
alpha_1 = [1e-06, 3e-06, 6e-06]
alpha_2 = [1e-06, 3e-06, 6e-06]
tol = [1e-03, 3e-03, 6e-03]
fit_intercept = [True, False]
normalize = [True, False]
positive = [True, False]
max_iter = [100, 300, 500, 800, 1000]
lambda_1 = [1e-06, 6e-06, 7e-06, 8e-06]
lambda_2 = [1e-07, 1e-08, 1e-06]
param_grid = [
    {
        'classify__fit_intercept': fit_intercept,
        'classify__normalize': normalize,
        'classify__n_iter': max_iter,
    },
]
grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
grid.fit(X_all_selection, y_all)
best_parameters = grid.best_params_
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
regModel = linear_model.BayesianRidge(normalize=True, n_iter=100, fit_intercept=True)
model(regModel, X_train, X_test, y_train, y_test)
dataNoise = data[(data["Country"] == "Brazil")]
new_features = []  # the names of the K best features selected above
for selected, feature in zip(columnsSelection, X_all):
    if selected:
        new_features.append(feature)
dataBenchNoise = dataNoise[columns].copy()
# Total number of users from Brazil
print("Total users: {}".format(len(dataBenchNoise)))
for fNull in features_multiples + features_normal:
    dataBenchNoise[fNull] = dataBenchNoise[fNull].fillna("Has no opinion")
# Final count
dataBenchNoise = dataBenchNoise.dropna(how='any')
print("Total users after dropping nulls: {}".format(len(dataBenchNoise)))
X_all_noise = dataBenchNoise[features]
y_all_noise = dataBenchNoise["Salary"]
# Build the dummies
for feature_multiple in features_multiples:
    get_dummies_skills_noise(feature_multiple)
for feature_normal in features_normal:
    X_all_noise = get_dummies_normal_noise(feature_normal)
X_all_noise["HaveWorkedLanguage_Hack"] = 0
X_all_noise["YearsProgram_18 to 19 years"] = 0
X_all_noise["YearsCodedJob_19 to 20 years"] = 0
X_all_noise["HaveWorkedLanguage_Smalltalk"] = 0
X_all_noise["HomeRemote_Has no opinion"] = 0
X_all_test_noise = pd.concat([X_all_noise[new_features], X_all[new_features]])
y_all_test_noise = pd.concat([y_all_noise, Y_all_origin])
y_all_test_noise = preprocessing.robust_scale(y_all_test_noise.values.reshape(-1, 1))
X_train_noise, X_test_noise, y_train_noise, y_test_noise = train_test_split(X_all_test_noise, y_all_test_noise, train_size=0.7, random_state=42)
print "O conjunto de treinamento tem {} amostras.".format(X_train.shape[0])
print "O conjunto de teste tem {} amostras.".format(X_test.shape[0])
model(regModel, X_train_noise, X_test_noise, y_train_noise, y_test_noise)
scoresFeatures = dict(zip(X_all.columns, scores))
from operator import itemgetter
sortedFeatures = sorted(scoresFeatures.items(), key=itemgetter(1))[-10:]
print(sortedFeatures)
sortedFeaturesDict = dict(sortedFeatures)
x = np.arange(len(sortedFeaturesDict))
y = list(sortedFeaturesDict.values())
plt.bar(x, y)
plt.title("Top 10 features")
plt.xlabel('Features')
plt.ylabel('Score F_Regression')
plt.xticks(x, list(sortedFeaturesDict.keys()), rotation='vertical')
plt.show()
print(list(sortedFeaturesDict.keys()))